/***************************************************************************************
Firm-embedded productivity and cross-country income differences
Alviarez, Cravino and Ramondo
Journal of Political Economy (2022)

Program: prep_01_forreg.do
Date: October 2022

Description: Constructiong bilateral shares at the level of the firm, sin(phi)

*****************************************************************************************/

*-------------------------------------------------------------------------------
global typeden=1
include "set_directories.do"
set memory 64g
global lf "LF"
*-------------------------------------------------------------------------------

clear all
set more off
local var0 sales /*Key variable of interest*/
local varset sales emp exports va labcost /*all relevant variables*/
local types "naics naics4"


etime, start
foreach type in `types' {

display "`type'"

use year isocode id_bvd guo_bvd hq `varset' sector* NAICS* con_code using "${data}/firm_allyears_sales.dta", clear 
drop if `var0'==. | `var0'==0
tab con_code
drop if con_code=="$lf"
drop con_code
tab sector


*Bring the age of the firm*
*-----------------------------------------------------
gen bvdidnumber=id_bvd
foreach xx in year_1_1990  year_1990_2000 year_2000_2005 year_2005_2010 year_2010_2015 year_2015 {
display "`xx'"
merge m:1 bvdidnumber using "${data}/bvdid_incorportation_`xx'.dta", keepusing(year_unit) update
drop if _merge==2
drop _merge 
}
gen firm_age=year-year_unit
sum firm_age, d
$conditionforage 


*Choosing the industry level 
*-----------------------------------------------------
if "`type'"=="naics4" {
drop if sector=="NA"
tab sector
local industry NAICS4 
}
if "`type'"=="naics3" {
drop if sector=="NA"
tab sector
local industry NAICS3 
}
if "`type'"=="naics" {
drop if sector=="NA"
tab sector
local industry sector 
}
if "`type'"=="market" {
keep if sector1=="Manufacturing (C)" | sector1=="Market_Services (G-H-I-J-K-M-N-R-S-T)" | sector1=="Other_Goods (A-B-D-E-F)" 
drop sector*
gen sector="Market (MARKT)"
gen sector1=sector
local industry sector 
}
if "`type'"=="MS" {
keep if sector1=="Manufacturing (C)" |  sector1=="Market_Services (G-H-I-J-K-M-N-R-S-T)" 
drop sector*
gen sector="MS"
gen sector1=sector
local industry sector 
}
if "`type'"=="manuf" {
keep if sector1=="Manufacturing (C)" 
drop sector*
gen sector="Manufacturing (C)"
gen sector1=sector
local industry sector 
}
if "`type'"=="serv" {
keep if sector1=="Market_Services (G-H-I-J-K-M-N-R-S-T)" 
drop sector*
gen sector="Market_Services (G-H-I-J-K-M-N-R-S-T)"
gen sector1=sector
local industry sector 
}
*

*Collapse at the level of the parent-industry-isocode (this includes the sales of the GUO-isocode at home as well as in other countries)
*-----------------------------------------------------
sort year isocode sector1 sector `industry' hq guo_bvd
collapse (sum) `varset' (max) firm_age, by(year isocode sector1 sector `industry' hq guo_bvd)


*Keep only MNCs in the sample based on the number of countries (across ALL SECTORS) 
*----------------------------------------------------- 
by year guo_bvd isocode, sort: gen a=_n==1
tab a
by year guo_bvd, sort: egen num_iso=total(a)
drop a

gen MNC=0
replace MNC=1 if num_iso>1
tab MNC
tab MNC if isocode!=hq
drop if MNC==0 & isocode==hq /*Notice that The above line keeps firms with only one affiliate but in a foreign country.*/
tab MNC
drop if MNC==0 /*Nonetheless for identification an MNCs need to be in at least 2 foreign countries to be in the sample*/




*Rename each variable of interest in the dataset
*----------------------------------------------------- 
foreach vv of local varset {
display "`vv'"
rename `vv' sin_`vv'
}
sort year hq isocode
keep if sin_`var0'!=. & sin_`var0'!=0 
compress



*Bring aggregate Klems-OECD data --- Xn, Xnn, Xn-exp (this files were prepared in 'aggregate_data_sales.do' files ----
*-----------------------------------------------------
merge m:1 year sector isocode using "${data}/klems_oecd_unido_orbis_sales_emp_exp.dta", keepusing(EMP* GO_usd GO_usd_exp VA_usd exports LAB_usd data_source) 
keep if year>=2005 & year<=2017
drop if _merge==2 
drop _merge

*Rename KLEMS/OECD variables  
gen Xnn2=GO_usd
rename VA_usd yn_va_ko
rename EMPE yn_emp_ko
rename exports yn_exp_ko
rename GO_usd yn_`var0'_ko
rename LAB_usd yn_labcost_ko
rename Xnn2 ynn_`var0'_ko
rename GO_usd_exp yn_`var0'_exp_ko
format %9.0fc yn_`var0'_ko ynn_`var0'_ko yn_`var0'_exp_ko yn_exp_ko yn_va_ko yn_labcost_ko

tempfile t0
save `t0', replace


*Keep only the aggregate data and then bring it to the firm level data
*-----------------------------------------------------
use `t0', clear
keep year isocode sector1 sector `industry' yn_`var0'_ko yn_emp_ko  yn_va_ko ynn_`var0'_ko yn_`var0'_exp_ko  yn_exp_ko yn_labcost_ko 
duplicates drop 
collapse (sum) yn_`var0'_ko yn_emp_ko yn_va_ko ynn_`var0'_ko yn_`var0'_exp_ko  yn_exp_ko yn_labcost_ko, by(year isocode sector1 sector `industry')
tempfile t1
save `t1', replace

use `t0', clear
collapse  (sum) sin_sales sin_emp sin_va sin_exp sin_labcost (max) firm_age, by(year guo_bvd isocode hq  sector1 sector `industry')
merge m:1 year isocode sector1 sector `industry' using `t1'
drop _merge

foreach i in sin_sales sin_emp sin_va sin_exp sin_labcost yn_`var0'_ko yn_emp_ko ynn_`var0'_ko yn_`var0'_exp_ko  yn_exp_ko yn_labcost_ko  {
replace `i'=. if `i'==0
}
tempfile final
save `final', replace


*Construct the relevant shares using aggregate KLEMS/OECD
*-----------------------------------------------------
use `final', clear

display "Drop if revenues are below 100,000 USD" 
sum sin_`var0', d
sum sin_`var0' if sin_`var0'<0.1 
drop if sin_`var0'<0.1 

gen sin_emp_ko=sin_emp/yn_emp_ko
gen sin_va_ko=sin_va/yn_va_ko
gen sin_labcost_ko=sin_labcost/yn_labcost_ko
gen sin_`var0'_ko=sin_`var0'/yn_`var0'_ko
gen sin_`var0'_local_ko=sin_`var0'/ynn_`var0'_ko
gen sin_`var0'_exp_ko=(sin_`var0'-sin_exp)/yn_`var0'_exp_ko

*Drop observations outside the range (only happens for HK, SG and JP)
drop if sin_sales_ko>=1 | sin_sales_ko<=0 | sin_sales_ko==.
replace sin_emp_ko=. if sin_emp_ko>=1 | sin_emp_ko<=0
replace sin_va_ko=. if sin_va_ko>=1 | sin_va_ko<=0
replace sin_labcost_ko=. if sin_labcost_ko>=1 | sin_labcost_ko<=0
replace sin_`var0'_exp_ko=. if sin_`var0'_exp_ko>=1 | sin_`var0'_exp_ko<=0


*Keep only MNC (two countries, which includes two or more affiliates) in a given sector -- notice this is more demanding that the applied restrictions above, but necessary for the sectoral level regressions
*----------------------------------------------
by year sector guo_bvd isocode, sort: gen a=_n==1
by year sector guo_bvd, sort: egen b=total(a)
tab b 
keep if b>=2
drop a b 
format %9.4fc sin* 
format %9.0fc *`var0' yn* 
sort sector1 sector `industry' hq isocode
order sector1 sector `industry' hq isocode sin_`var0'_ko sin_`var0'_local_ko sin_`var0'_exp_ko 


*Create percentiles 
*----------------------------------------------
foreach pos in 10 20 30 40 50 60 70 80 90 {
display `pos'
gsort year isocode sector1 sector `industry' -sin_sales
by year isocode sector1 sector `industry': egen p`pos'_sales = pctile(sin_sales), p(`pos')
format %9.3fc p`pos'_sales
label var p`pos'_sales "pctile `pos' of sin_sales"
}
*


*Number of countries in which the affiliate operates (outside home)
*----------------------------------------------
sort year sector1 `industry' hq guo_bvd isocode
by year sector1 `industry' hq guo_bvd isocode, sort: gen a=_n==1
replace a=0 if isocode==hq
by year sector1 `industry' hq guo_bvd, sort: egen num_ctry=total(a)
drop a
label var num_ctry "number of foreign countries GUO operates within `type'"

tempfile LCS
save `LCS', replace


*Firms in the Largest Connected Set 
*----------------------------------------------
use `LCS', clear
keep year `industry' guo_bvd isocode iso sin_`var0' sin_`var0'_ko yn_`var0'_ko
duplicates drop
sort year isocode guo_bvd
egen iso=group(isocode)
egen guo=group(guo_bvd)
egen sss=group(`industry')
order year `industry' guo_bvd isocode sss guo iso sin_`var0' sin_`var0'_ko yn_`var0'_ko
sort year sss iso guo 
keep if isocode!=""
keep if isocode=="JP" | isocode=="IT" | isocode=="KR" | isocode=="DE" | isocode=="FR" | isocode=="ES" |  isocode=="MX" |  isocode=="GB" |  isocode=="PL" |  isocode=="GR" |  isocode=="NL" |  isocode=="RO" |  isocode=="BE" |  isocode=="AT" |  isocode=="PT" |  isocode=="FI" |  isocode=="SE" |  isocode=="CZ" |  isocode=="SK" |  isocode=="BG" |  isocode=="HU" |  isocode=="HR" |  isocode=="LV" |  isocode=="SI" |  isocode=="LT" |  isocode=="EE" 
save "${output}/LCS_`type'.dta", replace 
keep year sss iso guo sin_`var0' sin_`var0'_ko yn_`var0'_ko
sort year sss iso guo 
order year sss guo iso sin_`var0' sin_`var0'_ko yn_`var0'_ko 
export delimited using "${output}/LCS_`type'", novarnames replace



*Labeling variables of interest 
*----------------------------------------------
use `LCS', clear
label var sin_emp_ko "sin_emp/yn_emp (den KLEMS/OECD)"
label var sin_va_ko "sin_va/yn_va (den KLEMS/OECD)"
label var sin_`var0'_ko "sin_`var0'/yn (den KLEMS/OECD)"
label var sin_`var0'_local_ko "sin_`var0'/ynn (den KLEMS/OECD)"
label var sin_`var0'_exp_ko "(sin_`var0'-sin_exp)/yn_`var0'_exp_ko (den KLEMS/OECD/WIOT)"
label var sin_`var0' "`var0' of i MNCs operating in n (cond operating in i)"
label var sin_emp "emp of i MNCs operating in n (cond operating in i)"
label var sin_exp "exports of i MNCs operating in n (cond operating in i)"
label var sin_va "value added of i MNCs operating in n (cond operating in i)"
label var sin_labcost "Labor cost of i MNCs operating in n (cond operating in i)"

label var yn_`var0'_ko "total `var0' in country n"
label var yn_emp_ko "total emp in country n"
label var yn_va_ko "total va in country n"
label var yn_labcost_ko "total labor cost in country n"
label var yn_exp_ko "total exports in country n"
label var ynn_`var0'_ko "total `var0' of local firms in country n"
label var yn_`var0'_exp_ko "total `var0' (minus exports) in country n"

compress


*Saving the file
*----------------------------------------------
drop if year==.
order year sector1 sector `industry' hq isocode sin_`var0'_ko sin_`var0'_local_ko sin_`var0'_exp_ko 
compress
tempfile sin_firmlevel_`type'
save `sin_firmlevel_`type'', replace
}
etime 



*Check for outliers
*----------------------------------------------
display "Drop foreing affiliates below 1MM USD and Parents below 5MM USD"
clear all
local whattype naics
use `sin_firmlevel_`whattype'', clear

sum sin_`var0', d
sum sin_`var0' if sin_`var0'<1 
tab isocode if sin_`var0'>=1
display "Drop firms below 1 MILLION USD" 

duplicates drop 
compress 
tempfile sample_firms
save `sample_firms', replace


*===============================================================================
*Temporary check on gravity variables
*===============================================================================
use "${data}/gravdata_00_16.dta", clear
keep if year==2016
replace year=2017
tempfile temp
save `temp', replace

use "${data}/gravdata_00_16.dta", clear
append using `temp'
tempfile gravdata_00_17
save `gravdata_00_17', replace



*Additions: include gravity variables and a dummy of constant MNC across years  
*----------------------------------------------
clear all
set more off

etime, start 
foreach type in `types' {

display "`type'"

if "`type'"=="naics4" {
local industry NAICS4 
}
if "`type'"=="naics3" {
local industry NAICS3 
}
else {
local industry sector
}

*Bring gravity variables 
*-----------------------------------------------
use `sin_firmlevel_`type'', clear

gen iso2_o=hq 
gen iso2_d=isocode
local trade_var distw fta_wto contig comlang_off colony  pop_o pop_d gdp_o gdp_d gdpcap_o gdpcap_d   
merge m:1 iso2_o iso2_d year using `gravdata_00_17', keepusing(`trade_var')

drop if _merge==2
drop _merge
foreach var in iso2_o iso2_d {
encode `var', gen(`var'b)
drop `var'
rename `var'b `var'
}
*
gen distance=ln(distw)
drop distw
drop iso2_o iso2_d
rename fta_wto RTA
label var distance "ln distance" 
label var RTA "Regional Trade Aggrement" 
label var comlang_off "Common Language" 
label var contig "commnon Border" 
label var colony "Former Colony" 
compress


*Flag MNCs number of years it has information (out of yy years)
*-----------------------------------------------
by sector1 sector guo_bvd isocode year, sort: gen a=_n==1
replace a=0 if year<2010
by sector1 sector guo_bvd isocode, sort: egen years_after2010=total(a)
drop a
by sector1 sector guo_bvd isocode year, sort: gen a=_n==1
by sector1 sector guo_bvd isocode, sort: egen years_allperiod=total(a)
drop a
gen sample_const=1
compress
save  "${data}/sin_firmlevel_`type'_gravity${lf}.dta", replace
}
etime

*log close 

